# Load packages
library(readr)
library(stringr)
# Set seed
set.seed(1996)
#############################
# Clear the workspace, keeping only 'script', 'scripts' and 'log_file'
# (presumably set by a driver script that runs this file -- TODO confirm).
# ls() skips dot-prefixed names by default, so the .Random.seed created by
# set.seed() above is not removed here.
rm(list = setdiff(ls(), c('script', 'scripts', 'log_file')))
#############################
# Load Issue Statements
issue_statements <- readRDS('issue_statements2018_2022.rds')

# Show the available columns. print() is explicit so the names also appear
# when the file is run through source(), which does not auto-print values.
print(colnames(issue_statements))
# Create Preprocessing Function
#
# Normalises raw text for tokenization.
#
# Args:
#   text: Character vector (or anything tolower() accepts) of raw text.
#
# Returns:
#   Character vector of the same length as `text`: lowercased, containing
#   only the letters a-z separated by single spaces, with selected compound
#   terms fused into one token (e.g. "medicare for all" -> "medicareforall")
#   and one-letter words other than "a" and "i" removed.
preprocess_text <- function(text) {
  # Make lowercase
  text <- tolower(text)

  # Turn newlines, tabs and runs of blanks into a single space. Without this
  # the non-alphabetic filter below deletes line breaks outright and glues
  # the neighbouring words together ("health\ncare" -> "healthcare").
  text <- gsub("[[:space:]]+", " ", text, perl = TRUE)

  # Convert important compound string patterns for consistent tokenization.
  # Each rule is c(pattern, replacement); rules are applied in this order.
  rules <- list(
    c("medicare for all", "medicareforall"),
    c("medicare-for-all", "medicareforall"),
    c("single-payer", "singlepayer"),
    c("single payer", "singlepayer"),

    c("pro choice", "prochoice"),
    c("pro-choice", "prochoice"),
    c("pro life", "prolife"),
    c("pro-life", "prolife"),
    c("100%", "one hundred percent"),

    c("k-12", "ktwelve"),
    c("k12", "ktwelve"),
    c("pre-k", "prek"),
    # \\b keeps the digit from matching inside a larger number, so that
    # "14 year" is not rewritten to "1four year".
    c("\\b4[- ]year", "four year"),
    c("\\b2[- ]year", "two year"),

    c("2nd amendment", "second amendment"),
    # Match "2a" only as a standalone token: not inside "2am" or "12a", and
    # not as part of the "h-2a" visa category.
    c("(?<![a-z0-9])(?<!h-)2a(?![a-z0-9])", "second amendment"),

    c("bi-partisan", "bipartisan"),
    c("non-partisan", "nonpartisan")
  )
  for (rule in rules) {
    text <- gsub(rule[1], rule[2], text, perl = TRUE)
  }

  # Replace hyphen with space
  text <- gsub("-", " ", text, fixed = TRUE)

  # Remove non alphabetic characters
  text <- gsub("[^a-zA-Z ]", "", text)
  # Remove one-letter words, except the real words "a" and "i"
  text <- gsub("\\b(?!a\\b|i\\b)\\w\\b", "", text, perl = TRUE)

  # The removals above can leave runs of spaces behind; collapse them so
  # tokens are always separated by exactly one space.
  text <- gsub(" +", " ", text)
  trimws(text)
}

# Pre-process issue statement text and issue headers
issue_statements$issue_text <- preprocess_text(issue_statements$issue_text)
issue_statements$issue_header <- preprocess_text(issue_statements$issue_header)

# Merge Issue Text with Issue Header for incorporating into estimation.
# A missing header or text is treated like an empty string: paste() would
# otherwise write the literal token "NA" into the merged text.
issue_statements$issue_final <- paste(
  ifelse(is.na(issue_statements$issue_header), '',
         issue_statements$issue_header),
  ifelse(is.na(issue_statements$issue_text), '',
         issue_statements$issue_text),
  sep = ' '
)

# Create Variable for Embedding: candidate, state, district and year joined
# by single spaces (paste() default separator)
issue_statements$candidate_id <- paste(issue_statements$candidate,
                                       issue_statements$state_postal,
                                       issue_statements$cd,
                                       issue_statements$year)

# Count average number of words in issue text.
# print() is explicit so the value also appears when the file is run through
# source(); na.rm keeps a missing statement from turning the mean into NA.
print(mean(str_count(issue_statements$issue_text, '\\w+'), na.rm = TRUE))

# Save CSV for embeddings model (main)
write.csv(issue_statements, 'Model_text.csv')

#############################
# Validity Check Emphasis versus positions (Appendix C)
#############################
# Each selected statement is duplicated under its own candidate ID and
# appended to the full data.
#
# Rows are selected through which(): indexing with the raw logical vector
# would return an all-NA row for every comparison that evaluates to NA.
# NOTE(review): candidate is matched as 'Last, First' for Blumenauer but as
# 'First Last' for Duncan -- confirm both spellings against the data.

# Extract Blumenauer's issue statement on healthcare
is_blumenauer_health <- issue_statements$candidate == 'Blumenauer, Earl' &
  issue_statements$year == 2020 &
  issue_statements$issue_header == 'universal affordable health care'
temp1 <- issue_statements[which(is_blumenauer_health), ]
if (nrow(temp1) == 0) {
  stop('No issue statement matched for Blumenauer, Earl (2020, health care)',
       call. = FALSE)
}

# Create separate candidate ID
temp1$candidate_id <- 'Blumenauer, Earl OR 3 2020 Healthcare'

# Extract Duncan's issue statement on immigration
is_duncan_immigration <- issue_statements$candidate == 'Jeff Duncan' &
  issue_statements$year == 2022 &
  issue_statements$issue_header == 'immigration'
temp2 <- issue_statements[which(is_duncan_immigration), ]
if (nrow(temp2) == 0) {
  stop('No issue statement matched for Jeff Duncan (2022, immigration)',
       call. = FALSE)
}
# Create separate candidate ID
temp2$candidate_id <- 'Jeff Duncan SC 3 2022 Immigration'

# Merge with all issue statements data
issue_statements <- rbind(issue_statements, temp1, temp2)

# Save CSV
write.csv(issue_statements, 'Model_text_emphasis.csv')

